**Project Title: Income and Employment for the Various Counties in the USA**

#census_api_key(Sys.getenv("CENSUS_API_KEY"), install = TRUE)
#install.packages("tidycensus")
# Load necessary libraries
library(tidycensus)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

#install.packages("factoextra")
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("ggplot2")
#install.packages("usmap")
#install.packages("maps")
library(ggplot2)
library(usmap)
library(maps)

DF1 - EMPLOYMENT

#Abhay
#Employment - K202301 for 2021

# Get ACS data: employment status by state (table K202301).
# FIX: "acs1-year" is not a documented `survey` value in tidycensus
# (valid options are "acs1", "acs3", "acs5"); K-prefixed tables are routed
# to the 1-year Supplemental Estimates automatically, so use "acs1" as the
# housing section (df5) already does.
df1 <- get_acs(geography = "state",
               table = "K202301",
               year = 2021,
               survey = "acs1",
               cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202301 and caching the dataset for faster future access.
# Lookup table mapping K202301 variable codes to human-readable labels.
# The codes are K202301_001 .. K202301_007, so generate them rather than
# spelling each one out.
variables_df1 <- data.frame(
  variable = sprintf("K202301_%03d", seq_len(7)),
  label = c("Total", "In labor force:", "Civilian labor force:", "Employed",
            "Unemployed", "In Armed Forces",
            "Not in labor force")
)
# Join with variable labels


# Now df_labeled will have an additional column with the variable labels
# Load variables for ACS 2022
# Sample mapping of variable codes to labels
# Replace this with the actual codes and labels


# Assuming df1 is your data frame with the estimates you've loaded
# Make sure to replace 'NAME' and 'estimate' with the actual column names from your df1

# Attach the human-readable labels to each ACS variable code.
df1_labeled <- left_join(df1, variables_df1, by = "variable")

# Reshape so each state is one row and each labelled measure one column.
df1_wide <- pivot_wider(
  df1_labeled,
  names_from = label,
  values_from = estimate,
  id_cols = NAME
)


# df_wide now has states as columns and the descriptive variable labels as rows
# # Calculate employment rates
# df1_wide <- df1_wide %>%
#   mutate(
#     EmploymentRate = `In labor force: Civilian labor force: Employed` / `Total` * 100,
#     UnemploymentRate = `In labor force: Civilian labor force: Unemployed` / `Total` * 100,
#     NotInLaborForceRate = `Not in labor force` / `Total` * 100
#   )
# 
# # Plotting employment rates
# library(ggplot2)
# 
# ggplot(df1_wide, aes(x = reorder(NAME, -EmploymentRate), y = EmploymentRate)) +
#   geom_bar(stat = "identity", fill = "steelblue") +
#   labs(x = "State", y = "Employment Rate (%)", title = "Employment Rates by State") +
#   theme_minimal() +
#   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# # Plotting labor force participation
# ggplot(df1_wide, aes(x = NAME)) +
#   geom_bar(aes(y = `In labor force: Civilian labor force: Employed`, fill = "Employed"), stat = "identity") +
#   geom_bar(aes(y = `In labor force: Civilian labor force: Unemployed`, fill = "Unemployed"), stat = "identity") +
#   geom_bar(aes(y = `In labor force: Armed Forces`, fill = "Armed Forces"), stat = "identity") +
#   scale_fill_manual(values = c("Employed" = "green", "Unemployed" = "red", "Armed Forces" = "blue")) +
#   theme(axis.text.x = element_text(angle = 90)) +
#   labs(x = "State", y = "Number of People", fill = "Category", title = "Labor Force Participation by State")
# ggplot(df1_wide, aes(x = UnemploymentRate)) +
#   geom_histogram(binwidth = 1, fill = "tomato", color = "black") +
#   labs(x = "Unemployment Rate (%)", y = "Number of States", title = "Distribution of Unemployment Rates across States")
# # Correlation analysis
# employment_data <- df1_wide %>%
#   select(`In labor force: Civilian labor force: Employed`, `In labor force: Civilian labor force: Unemployed`, `In labor force: Armed Forces`, `Not in labor force`)
# 
# correlation_matrix <- cor(employment_data, use = "complete.obs")
# 
# # Visualize the correlation matrix
# library(corrplot)
# corrplot(correlation_matrix, method = "circle")
# # Calculate employment rate
# # Assuming df1_wide or df6_wide has a column 'state' with state names or abbreviations
# # Calculate the metric you want to plot (for example, Employment Rate)
# 
# 
# # Assuming df1_wide has full state names in the 'NAME' column
# # Convert state names to abbreviations
# state_abbreviations <- state.abb[match(df1_wide$NAME, state.name)]
# df1_wide$state <- state_abbreviations
# 
# # Ensure you have calculated the metric you want to plot (e.g., Employment Rate)
# df1_wide$EmploymentRate <- df1_wide$`In labor force: Civilian labor force: Employed` / df1_wide$Total * 100
# 
# # Plot the map with employment rate
# plot_usmap(data = df1_wide, values = "EmploymentRate", labels = TRUE) +
#   scale_fill_continuous(name = "Employment Rate (%)", label = scales::percent_format()) +
#   theme(legend.position = "right") +
#   labs(title = "Employment Rate by State in 2021")


DF6 - Disabilities

#Abhay
#Disabilities - K201803
# Get ACS data for the Disabilities dataset (table K201803).
# FIX: "acs1-year" is not a documented `survey` value in tidycensus
# (valid options are "acs1", "acs3", "acs5"); K-prefixed tables load the
# Supplemental Estimates automatically.
df6 <- get_acs(geography = "state",
               table = "K201803",
               year = 2021,
               survey = "acs1",
               cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201803 and caching the dataset for faster future access.
# Lookup table mapping K201803 (disability) variable codes to readable labels.
# NOTE(review): "cognative" is a misspelling of "cognitive", but the label is
# kept verbatim because downstream code (select, lm, plots) references this
# exact column name.
variables_df6 <- data.frame(
  variable = sprintf("K201803_%03d", seq_len(9)),
  label = c("Total people", "Total With Disabilities",
            "Hearing", "Vision difficulty",
            "cognative", "ambulatory difficulty",
            "Self-care difficulty", "Independent living difficulty",
            "No Disability")
)

# Attach the human-readable labels to each ACS variable code.
df6_labeled <- left_join(df6, variables_df6, by = "variable")

# Reshape so each state is one row and each disability measure one column.
df6_wide <- pivot_wider(
  df6_labeled,
  names_from = label,
  values_from = estimate,
  id_cols = NAME
)
df6_wide
## # A tibble: 52 × 10
##    NAME        `Total people` Total With Disabilit…¹ Hearing `Vision difficulty`
##    <chr>                <dbl>                  <dbl>   <dbl>               <dbl>
##  1 Alabama            4957633                 808071  208028              152798
##  2 Alaska              702154                  92390   33397               15748
##  3 Arizona            7174053                 972252  298849              180792
##  4 Arkansas           2974701                 517051  142133              105624
##  5 California        38724294                4324355 1140131              844049
##  6 Colorado           5715497                 640346  211803              120570
##  7 Connecticut        3557526                 427014  113490               78078
##  8 Delaware            987964                 130551   37933               25335
##  9 District o…         659979                  76754   14429               14569
## 10 Florida           21465883                2906367  812248              555361
## # ℹ 42 more rows
## # ℹ abbreviated name: ¹​`Total With Disabilities`
## # ℹ 5 more variables: cognative <dbl>, `ambulatory difficulty` <dbl>,
## #   `Self-care difficulty` <dbl>, `Independent living difficulty` <dbl>,
## #   `No Disability` <dbl>
# df6_wide now has states as rows and the descriptive variable labels as columns
# Express disability prevalence as a share of each state's population.
df6_wide <- mutate(
  df6_wide,
  DisabilityRate = 100 * `Total With Disabilities` / `Total people`
)

# Plotting with ggplot2
library(ggplot2)

# Bar chart of disability rate by state, ordered from highest to lowest rate.
# geom_col() is the shorthand for geom_bar(stat = "identity").
ggplot(df6_wide, aes(x = reorder(NAME, -DisabilityRate), y = DisabilityRate)) +
  geom_col(fill = "steelblue") +
  labs(
    x = "State",
    y = "Disability Rate (%)",
    title = "Disability Rates by State"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Restrict to the individual disability-type counts for correlation analysis.
disability_cols <- c(
  "Hearing", "Vision difficulty", "cognative", "ambulatory difficulty",
  "Self-care difficulty", "Independent living difficulty"
)
disability_data <- select(df6_wide, all_of(disability_cols))

# Pairwise correlations; "complete.obs" drops rows with any missing value.
correlation_matrix <- cor(disability_data, use = "complete.obs")

# Ensure corrplot is installed before use.
# FIX: requireNamespace() is the idiomatic availability check; require()
# attaches the package as a side effect and is intended for interactive use.
if (!requireNamespace("corrplot", quietly = TRUE)) install.packages("corrplot")
library(corrplot)
# Visualize the pairwise correlations between disability types.
corrplot(correlation_matrix, method = "circle")

# Comparative analysis of disabilities within a single state (example: Iowa).
iowa_data <- df6_wide %>%
  filter(NAME == "Iowa") %>%
  select(
    Hearing, `Vision difficulty`, cognative, `ambulatory difficulty`,
    `Self-care difficulty`, `Independent living difficulty`
  )

# Grouped bar chart of Iowa's counts by disability type.
barplot(
  as.matrix(iowa_data),
  beside = TRUE,
  legend.text = TRUE,
  args.legend = list(x = "topright")
)

# Predicting 'Ambulatory difficulty' based on other disabilities
# State-level linear regression of ambulatory-difficulty counts on the other
# five disability-type counts.
# NOTE(review): these are raw population counts, so the very high R-squared in
# the output below largely reflects state size; modeling per-capita rates may
# be more meaningful — confirm intent.
lm_model <- lm(`ambulatory difficulty` ~ `Hearing` + `Vision difficulty` + `cognative` + `Self-care difficulty` + `Independent living difficulty`, data = df6_wide)

# Summary of the linear model
# Prints coefficient estimates, standard errors, and fit statistics.
summary(lm_model)
## 
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` + 
##     cognative + `Self-care difficulty` + `Independent living difficulty`, 
##     data = df6_wide)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -56621 -11941  -2766  17524  73868 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -8200.5856  5646.4977  -1.452 0.153197    
## Hearing                             0.6796     0.1793   3.791 0.000436 ***
## `Vision difficulty`                 1.0070     0.1644   6.126 1.87e-07 ***
## cognative                          -0.5426     0.2291  -2.369 0.022116 *  
## `Self-care difficulty`             -1.9464     0.4434  -4.390 6.59e-05 ***
## `Independent living difficulty`     1.9228     0.3144   6.115 1.95e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24940 on 46 degrees of freedom
## Multiple R-squared:  0.9968, Adjusted R-squared:  0.9965 
## F-statistic:  2909 on 5 and 46 DF,  p-value: < 2.2e-16
lm_model
## 
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` + 
##     cognative + `Self-care difficulty` + `Independent living difficulty`, 
##     data = df6_wide)
## 
## Coefficients:
##                     (Intercept)                          Hearing  
##                      -8200.5856                           0.6796  
##             `Vision difficulty`                        cognative  
##                          1.0070                          -0.5426  
##          `Self-care difficulty`  `Independent living difficulty`  
##                         -1.9464                           1.9228
# K-means clustering
#Perform a clustering analysis to identify groups of states with similar disability profiles.
# NOTE(review): kmeans below runs on raw counts, so cluster membership is
# dominated by state population size; clustering scaled data (as the PCA
# below does via scale. = TRUE) may be more informative — confirm intent.


# K-means clustering with 3 centers
set.seed(123)  # For reproducibility (kmeans uses random starting centers)
clusters <- kmeans(disability_data, centers = 3)

# Add cluster assignment to the data (factor so ggplot treats it as discrete)
df6_wide$Cluster <- as.factor(clusters$cluster)

# Project the six disability dimensions onto the first two principal
# components and color each state by its k-means cluster.
pca_res <- prcomp(disability_data, scale. = TRUE)
fviz_pca_biplot(pca_res, label = "none", col.ind = df6_wide$Cluster, addEllipses = TRUE)



DF2 - Education

#Neha
#Education - K201501
# Educational attainment by state (table K201501).
# FIX: "acs1/subject" is not a documented `survey` value in tidycensus
# (valid options are "acs1", "acs3", "acs5"); K-prefixed tables load the
# Supplemental Estimates automatically.
df2 <- get_acs(geography = "state",
               table = "K201501",
               year = 2021,
               survey = "acs1",
               cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201501 and caching the dataset for faster future access.
# Lookup table mapping K201501 (education) variable codes to readable labels.
variables_df2 <- data.frame(
  variable = sprintf("K201501_%03d", seq_len(8)),
  label = c("Education_Total_students", "Education_Below_9th grade",
            "Education_9th to 12th grade_no diploma",
            "Education_High_school_graduate",
            "Education_Some college_no degree",
            "Education_Associate's_degree", "Education_Bachelor's_degree",
            "Education_Graduate_professional degree")
)

# Attach readable labels, then widen: one row per state, one column per label.
df2_labeled <- left_join(df2, variables_df2, by = "variable")

df2_wide <- pivot_wider(
  df2_labeled,
  names_from = label,
  values_from = estimate,
  id_cols = NAME
)


DF3 - Citizenship

#Esha
#Citizenship - K200501

# Citizenship status by state (table K200501).
# FIX: "acs1/subject" is not a documented `survey` value in tidycensus
# (valid options are "acs1", "acs3", "acs5"); K-prefixed tables load the
# Supplemental Estimates automatically.
df3 <- get_acs(
  geography = "state",
  table = "K200501",
  year = 2021,
  survey = "acs1",
  cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200501 and caching the dataset for faster future access.
# Lookup table mapping K200501 (citizenship) variable codes to readable labels.
variables_df3 <- data.frame(
  variable = sprintf("K200501_%03d", seq_len(3)),
  label = c("Total", "U.S. citizen", "Not a U.S. citizen")
)

# Attach readable labels, then widen: one row per state, one column per label.
df3_labeled <- left_join(df3, variables_df3, by = "variable")

df3_wide <- pivot_wider(
  df3_labeled,
  names_from = label,
  values_from = estimate,
  id_cols = NAME
)

***


DF4 - Age

#Srika
#Age - K200104
# Age distribution by state (table K200104).
# FIX: "acs1/subject" is not a documented `survey` value in tidycensus
# (valid options are "acs1", "acs3", "acs5"); K-prefixed tables load the
# Supplemental Estimates automatically.
df4 <- get_acs(geography = "state",
               table = "K200104",
               year = 2021,
               survey = "acs1",
               cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
# Lookup table mapping K200104 (age) variable codes to readable labels.
variables_df4 <- data.frame(
  variable = sprintf("K200104_%03d", seq_len(8)),
  label = c("Total", "Age_under_18",
            "Age_18_to_24", "Age_25_to_34",
            "Age_35_to_44", "Age_45_to_54", "Age_55_to_64", "Age_over_64")
)

# Attach readable labels, then widen: one row per state, one column per label.
df4_labeled <- left_join(df4, variables_df4, by = "variable")

df4_wide <- pivot_wider(
  df4_labeled,
  names_from = label,
  values_from = estimate,
  id_cols = NAME
)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ stringr   1.5.0
## ✔ purrr     1.0.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::map()    masks maps::map()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
# Same age table, but with state geometries attached for mapping.
# FIX: "acs1/subject" is not a documented `survey` value in tidycensus;
# use "acs1" (K-prefixed tables load the Supplemental Estimates anyway).
Age_data <- get_acs(geography = "state",
                    table = "K200104",
                    year = 2021,
                    geometry = TRUE,
                    resolution = "20m",
                    survey = "acs1",
                    cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |======================================================================| 100%
# Map variable K200104_007 by state, with Alaska/Hawaii shifted into view.
# FIX: K200104_007 is the "Age 55 to 64" count (see variables_df4), not a
# median age, so the title/labels are corrected accordingly.
Mapping_age <- Age_data %>%
  filter(variable == "K200104_007") %>%
  shift_geometry()
ggplot(data = Mapping_age, aes(fill = estimate)) +
  geom_sf() +
  scale_fill_distiller(palette = "RdPu",
                       direction = 1) +
  labs(title = "Population Aged 55 to 64 by State, 2021",
       caption = "Data source: 2021 1-year ACS, US Census Bureau",
       fill = "ACS estimate") +
  theme_void()



DF5 - Housing

#Niharika
# Housing - K202502
# Housing tenure by state (table K202502, ACS Supplemental Estimates).
# Unlike the other sections, this call already uses the documented survey
# value "acs1".
df5 <- get_acs(geography = "state", 
                table = "K202502", 
                year = 2021, 
                survey = "acs1", # removed /subject since we're referring to a specific table ID
                cache_table = TRUE)
## Getting data from the 2021 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202502 and caching the dataset for faster future access.
# Variable codes and labels for the housing-tenure table (K202502).
variables_df5 <- data.frame(
  variable = sprintf("K202502_%03d", seq_len(3)),
  label = c("Total", "Owner Occupied", "Renter Occupied")
)

# Attach the human-readable labels to the housing estimates.
df5_labeled <- left_join(df5, variables_df5, by = "variable")

# Widen: one row per state, one column per housing-tenure category.
df5_wide <- pivot_wider(
  df5_labeled,
  names_from = label,
  values_from = estimate,
  id_cols = NAME
)

# df5_wide will have states as rows and different housing-related variables as columns

Plan for Data Exploration

# Combine all six state-level tables, keyed on the state name.
merged_data <- df1_wide %>%
  left_join(df2_wide, by = "NAME") %>%
  left_join(df3_wide, by = "NAME") %>%
  left_join(df4_wide, by = "NAME") %>%
  left_join(df5_wide, by = "NAME") %>%
  left_join(df6_wide, by = "NAME")


# The joins suffix the duplicated "Total" columns; drop those duplicates.
columns_to_drop <- c("Total.x", "Total.y", "Total.x.x", "Total.y.y")
merged_data <- select(merged_data, -any_of(columns_to_drop))

colnames(merged_data)
##  [1] "NAME"                                  
##  [2] "In labor force:"                       
##  [3] "Civilian labor force:"                 
##  [4] "Employed"                              
##  [5] "Unemployed"                            
##  [6] "In Armed Forces"                       
##  [7] "Not in labor force"                    
##  [8] "Education_Total_students"              
##  [9] "Education_Below_9th grade"             
## [10] "Education_9th to 12th grade_no diploma"
## [11] "Education_High_school_graduate"        
## [12] "Education_Some college_no degree"      
## [13] "Education_Associate's_degree"          
## [14] "Education_Bachelor's_degree"           
## [15] "Education_Graduate_professional degree"
## [16] "U.S. citizen"                          
## [17] "Not a U.S. citizen"                    
## [18] "Age_under_18"                          
## [19] "Age_18_to_24"                          
## [20] "Age_25_to_34"                          
## [21] "Age_35_to_44"                          
## [22] "Age_45_to_54"                          
## [23] "Age_55_to_64"                          
## [24] "Age_over_64"                           
## [25] "Owner Occupied"                        
## [26] "Renter Occupied"                       
## [27] "Total people"                          
## [28] "Total With Disabilities"               
## [29] "Hearing"                               
## [30] "Vision difficulty"                     
## [31] "cognative"                             
## [32] "ambulatory difficulty"                 
## [33] "Self-care difficulty"                  
## [34] "Independent living difficulty"         
## [35] "No Disability"                         
## [36] "DisabilityRate"                        
## [37] "Cluster"

************************

Title

“Socio-Economic Factors Influencing Employment in the United States: A Comprehensive State-by-State Analysis”

Thesis

This project aims to analyze the impact of various socio-economic factors such as education, age, citizenship status, housing, and disabilities on employment rates across different states in the United States. Using data from the American Community Survey (ACS) 2021, the study will employ statistical techniques to identify correlations, trends, and patterns, thereby providing insights into the multifaceted nature of employment dynamics in the US.

Roadmap and Code for Next Steps

  1. Data Exploration and Cleaning

    • Continue with data cleaning and handling missing values.

    • Normalize the data if required for better comparison.

# Checking for missing values
sum(is.na(merged_data))
## [1] 2
# Handling missing values: impute each numeric column's NAs with that
# column's median.
# FIX: the previous mutate_at(vars(-group_cols()), ~ifelse(...)) ran over
# every column; ifelse() strips attributes, which silently coerced the factor
# `Cluster` column to numeric and would error on a character column with NAs.
# Restricting to numeric columns (and using the supported across() API)
# avoids both problems.
merged_data <- merged_data %>%
  mutate(across(where(is.numeric),
                ~ ifelse(is.na(.x), median(.x, na.rm = TRUE), .x)))

# Normalizing the data (example: Min-Max Scaling)

# Rescale a numeric vector to [0, 1] via min-max scaling.
# Fixes over the original:
#  - NAs are ignored when computing the range (previously a single NA made
#    the entire result NA);
#  - a constant vector, where max == min previously produced NaN via 0/0,
#    now maps to all zeros.
normalize <- function(x) {
  rng <- range(x, na.rm = TRUE)
  span <- rng[2] - rng[1]
  if (span == 0) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / span
}

# Min-max scale every numeric column; non-numeric columns are dropped.
numeric_columns <- Filter(is.numeric, merged_data)
merged_data_normalized <- as.data.frame(lapply(numeric_columns, normalize))
  1. Exploratory Data Analysis (EDA)

    • Generate summary statistics and visualizations for each dataset to understand distributions and detect outliers.

    • Explore the relationships between different variables using scatter plots, box plots, etc.

# Summary Statistics
summary(merged_data)
##      NAME           In labor force:    Civilian labor force:    Employed       
##  Length:52          Min.   :  301303   Min.   :  298603      Min.   :  287432  
##  Class :character   1st Qu.:  897042   1st Qu.:  892926      1st Qu.:  845392  
##  Mode  :character   Median : 2134618   Median : 2113270      Median : 1975060  
##                     Mean   : 3259412   Mean   : 3233640      Mean   : 3028193  
##                     3rd Qu.: 3897332   3rd Qu.: 3877097      3rd Qu.: 3627645  
##                     Max.   :19961610   Max.   :19805371      Max.   :18156051  
##    Unemployed      In Armed Forces  Not in labor force Education_Total_students
##  Min.   :  11171   Min.   :  1313   Min.   :  160319   Min.   :  395348        
##  1st Qu.:  47280   1st Qu.:  4381   1st Qu.:  531030   1st Qu.: 1263471        
##  Median : 132957   Median : 12108   Median : 1463585   Median : 3054251        
##  Mean   : 205447   Mean   : 25771   Mean   : 1930062   Mean   : 4434517        
##  3rd Qu.: 216516   3rd Qu.: 25365   3rd Qu.: 2270377   3rd Qu.: 5085510        
##  Max.   :1649320   Max.   :156239   Max.   :11545627   Max.   :26909869        
##  Education_Below_9th grade Education_9th to 12th grade_no diploma
##  Min.   :   7224           Min.   :  17461                       
##  1st Qu.:  40727           1st Qu.:  65815                       
##  Median : 108208           Median : 173250                       
##  Mean   : 214804           Mean   : 261375                       
##  3rd Qu.: 212026           3rd Qu.: 294885                       
##  Max.   :2380657           Max.   :1804222                       
##  Education_High_school_graduate Education_Some college_no degree
##  Min.   :  70925                Min.   :  56221                 
##  1st Qu.: 335012                1st Qu.: 263017                 
##  Median : 813017                Median : 618456                 
##  Mean   :1166698                Mean   : 852411                 
##  3rd Qu.:1441058                3rd Qu.:1014627                 
##  Max.   :5578997                Max.   :5287901                 
##  Education_Associate's_degree Education_Bachelor's_degree
##  Min.   :  15235              Min.   :  73255            
##  1st Qu.: 123095              1st Qu.: 243453            
##  Median : 273742              Median : 547876            
##  Mean   : 389473              Mean   : 941835            
##  3rd Qu.: 460085              3rd Qu.:1259927            
##  Max.   :2120275              Max.   :5958030            
##  Education_Graduate_professional degree  U.S. citizen      Not a U.S. citizen
##  Min.   :  42363                        Min.   :  567083   Min.   :  11069   
##  1st Qu.: 158678                        1st Qu.: 1820268   1st Qu.:  66793   
##  Median : 358179                        Median : 4404876   Median : 148529   
##  Mean   : 607922                        Mean   : 6059088   Mean   : 411049   
##  3rd Qu.: 863754                        3rd Qu.: 6893788   3rd Qu.: 423286   
##  Max.   :3779787                        Max.   :34499330   Max.   :4738506   
##   Age_under_18      Age_18_to_24      Age_25_to_34      Age_35_to_44    
##  Min.   : 116767   Min.   :  51926   Min.   :  70511   Min.   :  78268  
##  1st Qu.: 442838   1st Qu.: 169060   1st Qu.: 235319   1st Qu.: 245703  
##  Median : 986696   Median : 402442   Median : 589334   Median : 584114  
##  Mean   :1423482   Mean   : 587334   Mean   : 874641   Mean   : 848734  
##  3rd Qu.:1629658   3rd Qu.: 683308   3rd Qu.:1039599   3rd Qu.: 961656  
##  Max.   :8769779   Max.   :3558188   Max.   :5827869   Max.   :5414686  
##   Age_45_to_54      Age_55_to_64      Age_over_64      Owner Occupied   
##  Min.   :  65531   Min.   :  66238   Min.   :  85615   Min.   : 132936  
##  1st Qu.: 216349   1st Qu.: 238707   1st Qu.: 320627   1st Qu.: 524154  
##  Median : 524686   Median : 559544   Median : 767373   Median :1143470  
##  Mean   : 790186   Mean   : 831869   Mean   :1089087   Mean   :1619184  
##  3rd Qu.: 888609   3rd Qu.: 958270   3rd Qu.:1272226   3rd Qu.:1912862  
##  Max.   :4928928   Max.   :4773860   Max.   :5964526   Max.   :7502706  
##  Renter Occupied    Total people      Total With Disabilities    Hearing       
##  Min.   :  69516   Min.   :  570046   Min.   :  76754         Min.   :  14429  
##  1st Qu.: 192915   1st Qu.: 1848408   1st Qu.: 257303         1st Qu.:  84763  
##  Median : 569181   Median : 4317403   Median : 660485         Median : 185961  
##  Mean   : 856022   Mean   : 6349048   Mean   : 830722         Mean   : 226790  
##  3rd Qu.:1032259   3rd Qu.: 7285134   3rd Qu.: 979094         3rd Qu.: 288780  
##  Max.   :5926357   Max.   :38724294   Max.   :4324355         Max.   :1140131  
##  Vision difficulty   cognative       ambulatory difficulty Self-care difficulty
##  Min.   : 12915    Min.   :  27746   Min.   :  33324       Min.   : 10601      
##  1st Qu.: 48799    1st Qu.: 104446   1st Qu.: 109640       1st Qu.: 43107      
##  Median :119128    Median : 261010   Median : 291072       Median :114600      
##  Mean   :158917    Mean   : 323599   Mean   : 400503       Mean   :154262      
##  3rd Qu.:193366    3rd Qu.: 374062   3rd Qu.: 462491       3rd Qu.:171742      
##  Max.   :844049    Max.   :1710305   Max.   :2094886       Max.   :972688      
##  Independent living difficulty No Disability      DisabilityRate 
##  Min.   :  23311               Min.   :  490741   Min.   :10.35  
##  1st Qu.:  77948               1st Qu.: 1570278   1st Qu.:12.05  
##  Median : 229318               Median : 3608154   Median :13.46  
##  Mean   : 296658               Mean   : 5518325   Mean   :13.75  
##  3rd Qu.: 346728               3rd Qu.: 6306040   3rd Qu.:14.38  
##  Max.   :1741491               Max.   :34399939   Max.   :22.01  
##     Cluster     
##  Min.   :1.000  
##  1st Qu.:1.000  
##  Median :2.000  
##  Mean   :1.712  
##  3rd Qu.:2.000  
##  Max.   :3.000
# Load necessary library
library(ggplot2)
library(rlang)
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice
# Draw one diagnostic plot per column of merged_data:
# histograms for numeric columns, bar plots for everything else.
for (col_name in names(merged_data)) {
  if (is.numeric(merged_data[[col_name]])) {
    # Numeric column: distribution via histogram.
    # .data[[col_name]] is the tidy-eval pronoun for a column named by string,
    # equivalent to the original !!sym(col_name) but without needing rlang.
    p <- ggplot(merged_data, aes(x = .data[[col_name]])) +
      geom_histogram(bins = 30, fill = "blue", color = "black") +
      theme_minimal() +
      labs(title = paste("Histogram of", col_name), x = col_name, y = "Frequency")
  } else {
    # Categorical column: counts via bar plot, with rotated x labels.
    p <- ggplot(merged_data, aes(x = .data[[col_name]])) +
      geom_bar(fill = "blue", color = "black") +
      theme_minimal() +
      labs(title = paste("Bar Plot of", col_name), x = col_name, y = "Count") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
  }

  # Explicit print() is required for ggplot objects inside a loop.
  print(p)
}

  1. Correlation Analysis

    • Calculate correlation matrices for the variables related to employment, education, age, etc., to identify strong correlations.
  2. Statistical Testing

    • Conduct hypothesis testing (e.g., t-tests, ANOVA) to determine if differences in employment rates across different groups (like age groups, educational levels) are statistically significant.
  3. Predictive Modeling

    • Build regression models to predict employment rates based on other variables.

    • Explore other predictive models if applicable (e.g., decision trees, random forests).

  4. Clustering Analysis

    • Use k-means or hierarchical clustering to find patterns or groups in the data based on socio-economic factors.
  5. Data Visualization and Mapping

    • Create comprehensive visualizations and maps to represent the findings. For instance, use ggplot2 and usmap for spatial data representation.
  6. Interpretation and Conclusion

    • Interpret the results from the statistical analyses and models.

    • Draw conclusions about the socio-economic factors affecting employment.

  7. Reporting

    • Compile the results and interpretations into a final report or presentation.